{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "import textgrid\n",
    "from pydub import AudioSegment\n",
    "import ntpath\n",
    "from pathlib import Path, PurePath"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers to parse time form textgrid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid=textgrid.TextGrid.fromFile('data/audio_google_aligned/down/00176480_nohash_0down.TextGrid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['words', 'phones']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid.getNames()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "interval=grid.getFirst('words')[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Interval(0.0, 0.43, None)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid.getList('words')[0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'down'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "interval.mark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.88"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "interval.maxTime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.43"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "interval.minTime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_word_interval(file_path, label):\n",
    "    grid=textgrid.TextGrid.fromFile(file_path)\n",
    "    words_list=grid.getList('words')[0]\n",
    "    for word in words_list:\n",
    "        if word.mark==label:\n",
    "            return word.minTime, word.maxTime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.43, 0.88)"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_word_interval('data/audio_google_aligned/down/00176480_nohash_0down.TextGrid', 'down')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "def segment_audio_file(wav_file_path, textgrid_file_path, to_folder, label):\n",
    "    start_sec, end_sec =get_word_interval(textgrid_file_path, label)\n",
    "    # Time to miliseconds \n",
    "    startTime = start_sec*1000\n",
    "    endTime = end_sec*1000\n",
    "    \n",
    "    # Opening file and extracting segment\n",
    "    song = AudioSegment.from_wav(wav_file_path)\n",
    "    extract = song[startTime:endTime]\n",
    "    \n",
    "    # Saving\n",
    "    filename=ntpath.basename(wav_file_path)\n",
    "    segment_wav_path=f'{to_folder}/{filename}'\n",
    "    extract.export(segment_wav_path, format=\"wav\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "def segment_audio_files(wav_path, text_grid_path, to_folder):\n",
    "    wav_files_path=list(Path(wav_path).rglob('*.wav'))\n",
    "    for i, wav_file_path in enumerate(wav_files_path):\n",
    "        if i%100==0:\n",
    "            print(f'working on file {i}')\n",
    "        pieces=PurePath(wav_file_path).parts\n",
    "        wav_text_grid_path=Path(text_grid_path)/pieces[-2]/pieces[-1].replace('.wav', '.TextGrid')\n",
    "        wav_to_folder=Path(to_folder)/pieces[-2]\n",
    "        wav_to_folder=str(wav_to_folder)\n",
    "        segment_audio_file(wav_file_path, wav_text_grid_path, wav_to_folder, pieces[-2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "working on file 0\n",
      "working on file 100\n",
      "working on file 200\n",
      "working on file 300\n",
      "working on file 400\n",
      "working on file 500\n",
      "working on file 600\n",
      "working on file 700\n",
      "working on file 800\n",
      "working on file 900\n",
      "working on file 1000\n",
      "working on file 1100\n",
      "working on file 1200\n",
      "working on file 1300\n",
      "working on file 1400\n",
      "working on file 1500\n",
      "working on file 1600\n",
      "working on file 1700\n",
      "working on file 1800\n",
      "working on file 1900\n",
      "working on file 2000\n",
      "working on file 2100\n",
      "working on file 2200\n",
      "working on file 2300\n",
      "working on file 2400\n",
      "working on file 2500\n",
      "working on file 2600\n",
      "working on file 2700\n",
      "working on file 2800\n",
      "working on file 2900\n",
      "working on file 3000\n",
      "working on file 3100\n",
      "working on file 3200\n",
      "working on file 3300\n",
      "working on file 3400\n",
      "working on file 3500\n",
      "working on file 3600\n",
      "working on file 3700\n",
      "working on file 3800\n",
      "working on file 3900\n",
      "working on file 4000\n",
      "working on file 4100\n",
      "working on file 4200\n",
      "working on file 4300\n",
      "working on file 4400\n",
      "working on file 4500\n",
      "working on file 4600\n",
      "working on file 4700\n",
      "working on file 4800\n",
      "working on file 4900\n",
      "working on file 5000\n",
      "working on file 5100\n",
      "working on file 5200\n",
      "working on file 5300\n",
      "working on file 5400\n",
      "working on file 5500\n",
      "working on file 5600\n",
      "working on file 5700\n",
      "working on file 5800\n",
      "working on file 5900\n",
      "working on file 6000\n",
      "working on file 6100\n",
      "working on file 6200\n",
      "working on file 6300\n",
      "working on file 6400\n",
      "working on file 6500\n",
      "working on file 6600\n",
      "working on file 6700\n",
      "working on file 6800\n",
      "working on file 6900\n",
      "working on file 7000\n",
      "working on file 7100\n",
      "working on file 7200\n",
      "working on file 7300\n",
      "working on file 7400\n",
      "working on file 7500\n",
      "working on file 7600\n",
      "working on file 7700\n",
      "working on file 7800\n",
      "working on file 7900\n",
      "working on file 8000\n",
      "working on file 8100\n",
      "working on file 8200\n",
      "working on file 8300\n",
      "working on file 8400\n",
      "working on file 8500\n",
      "working on file 8600\n",
      "working on file 8700\n",
      "working on file 8800\n",
      "working on file 8900\n",
      "working on file 9000\n",
      "working on file 9100\n",
      "working on file 9200\n",
      "working on file 9300\n",
      "working on file 9400\n",
      "working on file 9500\n",
      "working on file 9600\n",
      "working on file 9700\n",
      "working on file 9800\n",
      "working on file 9900\n",
      "working on file 10000\n",
      "working on file 10100\n",
      "working on file 10200\n",
      "working on file 10300\n",
      "working on file 10400\n",
      "working on file 10500\n",
      "working on file 10600\n",
      "working on file 10700\n",
      "working on file 10800\n",
      "working on file 10900\n",
      "working on file 11000\n",
      "working on file 11100\n",
      "working on file 11200\n",
      "working on file 11300\n",
      "working on file 11400\n",
      "working on file 11500\n",
      "working on file 11600\n",
      "working on file 11700\n",
      "working on file 11800\n",
      "working on file 11900\n",
      "working on file 12000\n",
      "working on file 12100\n",
      "working on file 12200\n",
      "working on file 12300\n",
      "working on file 12400\n",
      "working on file 12500\n",
      "working on file 12600\n",
      "working on file 12700\n",
      "working on file 12800\n",
      "working on file 12900\n",
      "working on file 13000\n",
      "working on file 13100\n",
      "working on file 13200\n",
      "working on file 13300\n",
      "working on file 13400\n",
      "working on file 13500\n",
      "working on file 13600\n",
      "working on file 13700\n",
      "working on file 13800\n",
      "working on file 13900\n",
      "working on file 14000\n",
      "working on file 14100\n",
      "working on file 14200\n",
      "working on file 14300\n",
      "working on file 14400\n",
      "working on file 14500\n",
      "working on file 14600\n",
      "working on file 14700\n",
      "working on file 14800\n",
      "working on file 14900\n",
      "working on file 15000\n",
      "working on file 15100\n",
      "working on file 15200\n",
      "working on file 15300\n",
      "working on file 15400\n",
      "working on file 15500\n",
      "working on file 15600\n",
      "working on file 15700\n",
      "working on file 15800\n",
      "working on file 15900\n",
      "working on file 16000\n",
      "working on file 16100\n",
      "working on file 16200\n",
      "working on file 16300\n",
      "working on file 16400\n",
      "working on file 16500\n",
      "working on file 16600\n",
      "working on file 16700\n",
      "working on file 16800\n",
      "working on file 16900\n",
      "working on file 17000\n",
      "working on file 17100\n",
      "working on file 17200\n",
      "working on file 17300\n",
      "working on file 17400\n",
      "working on file 17500\n",
      "working on file 17600\n",
      "working on file 17700\n",
      "working on file 17800\n",
      "working on file 17900\n",
      "working on file 18000\n",
      "working on file 18100\n",
      "working on file 18200\n",
      "working on file 18300\n",
      "working on file 18400\n",
      "working on file 18500\n",
      "working on file 18600\n",
      "working on file 18700\n",
      "working on file 18800\n",
      "working on file 18900\n",
      "Wall time: 5min 44s\n"
     ]
    }
   ],
   "source": [
    "%time segment_audio_files('data/audio_google4alignment/', 'data/audio_google_aligned/', 'data/audio_google_aligned/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
